% Construct Employment Data for each year from Compustat data

clear all;
small = 1.0e-10;
pinv_tol = 1.0e-05;
big = 1.0e+8;

% -- File Directories   
outdir = 'out/';
figdir = 'fig/';
matdir = 'mat/';
datadir = '../Data/FirmSize/';

% Read in Compustat Data
str_data = [datadir 'compustat_raw_20231117.xlsx'];
Comp_dat = readtable(str_data);
fy = Comp_dat.DataYear_Fiscal;
emp = Comp_dat.Employees;
Country_Code = Comp_dat.CurrentISOCountryCode_Incorporation;
Company_Name = Comp_dat.CompanyName;
% Make Sure all firms are incorporated in US 
index_USA = find(strcmp(Country_Code,'USA'));
fy = fy(index_USA);
emp = emp(index_USA);
Company_Name = Company_Name(index_USA);

% Delete United States Steel as this is also USX Corp-Consilidated 
index_USSteel = find(strcmp(Company_Name,'UNITED STATES STEEL CORP'));
fy(index_USSteel) = [];
emp(index_USSteel) = [];
Company_Name(index_USSteel) = [];

fy_first = 1950;
fy_last = 2022;
calvec = (fy_first:fy_last)';

% Construct matrix of N_Largest employment values for each year

N_largest = 100;

for t = fy_first:fy_last
    ii = find(fy == t);
    emp_ii = emp(ii);
    Company_Name_ii = Company_Name(ii);
    jj = 1;
    while jj > 0
      [~,isort] = sort(emp_ii,'descend');
      Names_to_check = Company_Name_ii(isort(1:N_largest));
         % Check First N_largest names for repeats
         jj = 0;
         for j = 1:N_largest
          tmp = Names_to_check(j);
          ind_j = find(strcmp(Company_Name_ii,tmp));
          if length(ind_j) > 1
              % delete all instances except first
              Company_Name_ii(ind_j(2:end)) = [];
              emp_ii(ind_j(2:end)) = [];
              jj = jj+1;
              fprintf('jj %d  ',jj);
          end
         end
    end
    fprintf('\n');
    if t == fy_first
          emp_largest = emp_ii(isort(1:N_largest));
          Company_Name_largest = Company_Name_ii(isort(1:N_largest));
      else
        emp_largest = [emp_largest emp_ii(isort(1:N_largest))];
        Company_Name_largest = [Company_Name_largest Company_Name_ii(isort(1:N_largest))];
    end 
    fprintf('Year %d %d \n',[t length(emp_ii)]);
end

% Read in BEA employment data from 1950 to 2022
str = [datadir 'USPRIV.xlsx'];
emp_aggregate = readmatrix(str,'Range','B23:B95');

% Normalize the employment data
emp_largest_normalized = 100*emp_largest./repmat(emp_aggregate',N_largest,1);

% Save Data to an Excel File where columns are years and rows are firms
str_data = [datadir 'compustat_top_' num2str(N_largest) '.xlsx'];
writetable(table({'Rank'}),str_data,'WriteVariableNames',false,'Range','A1');
writetable(table((1:N_largest)'),str_data,'WriteVariableNames',false,'Range','A2');
for i = 1: length(calvec)
    xcol1 = num2xlcol(4*i-1);
    xcol2 = num2xlcol(4*i-0);
    xcol3 = num2xlcol(4*i+1);
    writetable(table(calvec(i)),str_data,'WriteVariableNames',false,'Range',[xcol1 '1']);
    writetable(table({'Emp'}),str_data,'WriteVariableNames',false,'Range',[xcol2 '1']);
    writetable(table({'Emp/AggEmp'}),str_data,'WriteVariableNames',false,'Range',[xcol3 '1']);
    writetable(table(Company_Name_largest(:,i),emp_largest(:,i),emp_largest_normalized(:,i)),str_data,'WriteVariableNames',false,'Range',[xcol1 '2']);
end

% Save to Matlab file 
% Save the employment data

% Transpose the employment data so the matrices are years (T) by firms (N_largest)
emp_largest = emp_largest';
emp_largest_normalized = emp_largest_normalized';

str = [matdir 'FirmSize_top' num2str(N_largest) '_byYear.mat'];
save(str,'emp_largest','emp_largest_normalized','emp_aggregate','calvec','Company_Name_largest');







